{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "a4e56f9c-7f32-4750-99b7-47a284acbc4d",
   "metadata": {},
   "source": [
    "# 模型转换为ONNX格式\r\n",
    "OpenVINO可用于从Hugging Face Hub加载优化模型，并创建管道以使用Hugging Face API通过OpenVINO Runtime运行推理。这意味着我们只需要将AutoModelForXxx类替换为相应的OVModelForXxx类，就能实现模型格式的转换。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "3190be4d-4c3f-41e3-9637-634779909740",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, onnx, openvino\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/egcs/anaconda3/envs/openvino/lib/python3.10/site-packages/transformers/deepspeed.py:23: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations\n",
      "  warnings.warn(\n",
      "Framework not specified. Using pt to export to ONNX.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e75d05e29a6f474a80b9224c86f3bd40",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/egcs/anaconda3/envs/openvino/lib/python3.10/site-packages/transformers/generation/configuration_utils.py:362: UserWarning: `do_sample` is set to `False`. However, `temperature` is set to `0.9` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`. This was detected when initializing the generation config instance, which means the corresponding file may hold incorrect parameterization and should be fixed.\n",
      "  warnings.warn(\n",
      "/home/egcs/anaconda3/envs/openvino/lib/python3.10/site-packages/transformers/generation/configuration_utils.py:367: UserWarning: `do_sample` is set to `False`. However, `top_p` is set to `0.6` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `top_p`. This was detected when initializing the generation config instance, which means the corresponding file may hold incorrect parameterization and should be fixed.\n",
      "  warnings.warn(\n",
      "/home/egcs/anaconda3/envs/openvino/lib/python3.10/site-packages/transformers/generation/configuration_utils.py:508: UserWarning: The generation config instance is invalid -- `.validate()` throws warnings and/or exceptions. Fix these issues to save the configuration. This warning will be raised to an exception in v4.34.\n",
      "\n",
      "Thrown during validation:\n",
      "`do_sample` is set to `False`. However, `temperature` is set to `0.9` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.\n",
      "  warnings.warn(\n",
      "Using pad_token, but it is not set yet.\n",
      "Using pad_token, but it is not set yet.\n",
      "Using pad_token, but it is not set yet.\n",
      "Using pad_token, but it is not set yet.\n",
      "Using framework PyTorch: 2.0.1+cu117\n",
      "Overriding 1 configuration item(s)\n",
      "\t- use_cache -> True\n",
      "Asked a sequence length of 16, but a sequence length of 1 will be used with use_past == True for `input_ids`.\n",
      "/home/egcs/anaconda3/envs/openvino/lib/python3.10/site-packages/transformers/models/llama/modeling_llama.py:146: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
      "  if seq_len > self.max_seq_len_cached:\n",
      "/home/egcs/anaconda3/envs/openvino/lib/python3.10/site-packages/transformers/models/llama/modeling_llama.py:375: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
      "  if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):\n",
      "/home/egcs/anaconda3/envs/openvino/lib/python3.10/site-packages/transformers/models/llama/modeling_llama.py:382: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
      "  if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):\n",
      "/home/egcs/anaconda3/envs/openvino/lib/python3.10/site-packages/transformers/models/llama/modeling_llama.py:392: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
      "  if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):\n"
     ]
    }
   ],
   "source": [
    "from optimum.intel import OVQuantizer\n",
    "from optimum.intel.openvino import OVModelForCausalLM\n",
    "\n",
    "# Checkpoint to convert, and the folder that receives the converted model.\n",
    "pt_model_id = 'llama-2-chat-7b'\n",
    "model_dir = \"llama-2-chat-7b/ov_model\"\n",
    "\n",
    "# export=True converts the PyTorch checkpoint while loading it. compile=False is\n",
    "# passed because the model is only saved in this cell, not run.\n",
    "ov_model = OVModelForCausalLM.from_pretrained(\n",
    "    pt_model_id,\n",
    "    export=True,\n",
    "    compile=False,\n",
    ")\n",
    "ov_model.half()  # presumably switches the weights to FP16 before saving -- TODO confirm\n",
    "ov_model.save_pretrained(model_dir)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2f7fc35e-09fa-4c6d-8fa1-6a4791158301",
   "metadata": {},
   "source": [
    "# CPU推理\n",
    "指定部署推理的设备为CPU，让模型在Intel的CPU上进行推理。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "1a01497e-0207-494d-bd75-0cc894f90e3c",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "The argument `trust_remote_code` is to be used along with export=True. It will be ignored.\n",
      "Compiling the model to CPU ...\n"
     ]
    }
   ],
   "source": [
    "from optimum.intel.openvino import OVModelForCausalLM\n",
    "from transformers import AutoTokenizer\n",
    "from transformers import AutoConfig\n",
    "\n",
    "\n",
    "model_dir = \"llama-2-chat-7b/ov_model\"\n",
    "model_name = 'llama-2-chat-7b'\n",
    "\n",
    "# Runtime properties handed to OpenVINO: latency-oriented hint, a single stream,\n",
    "# and an empty CACHE_DIR.\n",
    "ov_config = {'PERFORMANCE_HINT': 'LATENCY', 'NUM_STREAMS': '1', \"CACHE_DIR\": \"\"}\n",
    "\n",
    "# The tokenizer is loaded from the original checkpoint folder, not from model_dir.\n",
    "tok = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n",
    "\n",
    "# trust_remote_code is deliberately not passed to OVModelForCausalLM: without\n",
    "# export=True the library ignores it and only emits a warning.\n",
    "ov_model = OVModelForCausalLM.from_pretrained(\n",
    "    model_dir,\n",
    "    device='cpu',\n",
    "    ov_config=ov_config,\n",
    "    config=AutoConfig.from_pretrained(model_dir, trust_remote_code=True),\n",
    ")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e952d16d-21ec-4fbb-9ed6-97f922b1fcff",
   "metadata": {},
   "source": [
    "## 提供网络接口方式\n",
    "LangChain 默认使用的模型是 OpenAI 的 ChatGPT。对于局域网应用来说，出于信息安全的要求，私有数据不能出网关，所以需要搭建本地模型。整个应用中硬件成本最高的部分就是 LLM 的部署，为了保障硬件的充分利用，最经济的方式是在一个局域网内对每种类型的 LLM 只统一部署一份。将 LLM 和 LangChain 分开部署的最大好处是灵活性：LangChain 本身已经是一个非常棒的设计样板，它只做资源整合，任何重存储和重计算的服务全部在远端部署，给 LangChain 应用留足生长的空间。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "b913378f-c38e-43c0-9dec-109392580315",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import (\n",
    "    AutoTokenizer,\n",
    "    TextIteratorStreamer,\n",
    ")\n",
    "\n",
    "def llama_partial_text_processor(partial_text, new_text):\n",
    "    \"\"\"Append a newly generated chunk to the text accumulated so far.\n",
    "\n",
    "    Literal ``[INST]`` and ``[/INST]`` markers are removed from the chunk\n",
    "    before it is appended.\n",
    "\n",
    "    Args:\n",
    "        partial_text: Text accumulated from earlier chunks.\n",
    "        new_text: The next chunk to clean and append.\n",
    "\n",
    "    Returns:\n",
    "        The accumulated text followed by the cleaned chunk.\n",
    "    \"\"\"\n",
    "    cleaned_chunk = new_text\n",
    "    for marker in (\"[INST]\", \"[/INST]\"):\n",
    "        cleaned_chunk = cleaned_chunk.replace(marker, \"\")\n",
    "    return partial_text + cleaned_chunk\n",
    "\n",
    "\n",
    "def fun(query: str, max_new_tokens: int = 256):\n",
    "    \"\"\"Answer ``query`` with the notebook-level model and return the reply text.\n",
    "\n",
    "    The query is wrapped in a Llama-2 chat prompt with a fixed Chinese system\n",
    "    prompt, tokenized with the notebook-level ``tok`` and generated with the\n",
    "    notebook-level ``ov_model``. The streamed chunks are then joined.\n",
    "\n",
    "    Args:\n",
    "        query: User question to send to the model.\n",
    "        max_new_tokens: Upper bound on newly generated tokens (default 256).\n",
    "\n",
    "    Returns:\n",
    "        The generated reply with every space character removed.\n",
    "    \"\"\"\n",
    "    system_prompt = \"你是一个乐于助人、尊重他人、诚实的助手。在安全的情况下，始终尽可能提供帮助。请用中文回答\"\n",
    "    # Build the prompt explicitly so the function body's indentation does not leak\n",
    "    # into the system prompt or in front of the user query.\n",
    "    prompt = f\"<s>[INST] <<SYS>>\\n{system_prompt}\\n<</SYS>>\\n\\n{query} [/INST]\"\n",
    "    # \"<s>\" is already spelled out in the prompt, so no extra special tokens are added.\n",
    "    input_tokens = tok(prompt, return_tensors=\"pt\", add_special_tokens=False)\n",
    "    streamer = TextIteratorStreamer(tok, timeout=30.0, skip_prompt=True, skip_special_tokens=True)\n",
    "    # generate() is called synchronously, so the streamer is only drained after\n",
    "    # generation has finished.\n",
    "    ov_model.generate(**input_tokens, max_new_tokens=max_new_tokens, streamer=streamer)\n",
    "    partial_text = \"\"\n",
    "    for new_text in streamer:\n",
    "        partial_text = llama_partial_text_processor(partial_text, new_text)\n",
    "    # NOTE(review): removing every space also merges words in any non-Chinese\n",
    "    # part of the reply -- confirm this is intended.\n",
    "    return partial_text.replace(\" \", \"\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9896a976-34ad-4224-aacb-1c84426dc0fd",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " * Serving Flask app '__main__'\n",
      " * Debug mode: off\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[31m\u001b[1mWARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.\u001b[0m\n",
      " * Running on http://127.0.0.1:8080\n",
      "\u001b[33mPress CTRL+C to quit\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "from flask import Flask, request\n",
    "app = Flask(__name__)\n",
    "\n",
    "@app.route('/', methods=['GET', 'POST'])\n",
    "def index():\n",
    "    \"\"\"Serve the model over HTTP.\n",
    "\n",
    "    A POST request must carry the question in the form field ``ask``; the\n",
    "    reply produced by ``fun`` is returned as the response body. A POST\n",
    "    without that field is answered with status 400. Any other allowed\n",
    "    method gets a fixed greeting.\n",
    "    \"\"\"\n",
    "    if request.method != 'POST':\n",
    "        return 'Hello, GET!'\n",
    "\n",
    "    query = request.form.get('ask')\n",
    "    if query is None:\n",
    "        # Without this guard fun() would fail on str + None and the client\n",
    "        # would get an opaque 500 instead of a clear client error.\n",
    "        return \"Missing form field 'ask'\", 400\n",
    "    print(query)\n",
    "    result = fun(query)\n",
    "    print(result)\n",
    "    return result\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    # Binds to the loopback address only; app.run() keeps this cell busy until interrupted.\n",
    "    app.run(\n",
    "        host='127.0.0.1',\n",
    "        port=8080,\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b7edb7d3-bddd-40e7-b887-b80a59b56aad",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
